# Start from an empty workspace so stale objects cannot leak into the analysis.
# NOTE(review): rm(list = ls()) also wipes the caller's objects when this file
# is source()d from an interactive session; running the script in a fresh R
# process is the safer way to get a clean state.
rm(list = ls())

# library() stops immediately when a package is missing, whereas require()
# only warns and returns FALSE, deferring the failure to the first package
# call further down.
library(quanteda)
library(LSX)
quanteda_options(threads = 8)

# Project-local helpers (contents are not visible from this script).
source("functions.R")

# Load the tokenised corpus and drop tokens made up only of numbers and
# punctuation characters.
toks <- tokens_remove(
  readRDS("data_tokens_ja.RDS"),
  "^[\\p{N}\\p{P}]+$",
  valuetype = "regex"
)

# Candidate model terms: keywords found within a five-token window around the
# target patterns (p threshold 0.001), computed after removing tokens that
# are a single hiragana character.
target <- c("政治*", "政府*")
poli <- char_keyness(
  tokens_remove(toks, "^[ぁ-ん]$", valuetype = "regex"),
  target,
  window = 5,
  p = 0.001
)

# Document-feature matrix without the empty-string feature, restricted to
# features with a total frequency of at least 10.
dfmt <- dfm_trim(dfm(toks, remove = ""), min_termfreq = 10)

# Fit the LSS model from the Japanese seed words, using the first 2000
# candidate terms.
seedwords <- dictionary(file = "seedwords.yml")
seed <- as.seedwords(seedwords$ja)
lss <- textmodel_lss(
  dfmt,
  seed,
  head(poli, 2000),
  k = 400,
  slice = 300,
  cache = TRUE,
  include_data = TRUE
)

# Print the first and last 30 term coefficients for a quick sanity check.
head(coef(lss), 30)
tail(coef(lss), 30)

# benchmarking -------------------------

# Score the benchmark documents (presumably the manually coded set, judging
# by the file name) with the fitted model and store the predictions alongside
# their document variables.
toks_mnu <- readRDS("data_tokens_manual_ja.RDS")
dfmt_mnu <- dfm(toks_mnu, remove = "")

dat <- docvars(dfmt_mnu)
# `newdata` is spelled out in full: the previous `newdat` only worked through
# R's partial argument matching and was inconsistent with the other predict()
# calls in this script.
dat$lss <- predict(lss, newdata = dfmt_mnu)

# Persist the fitted model and the scored benchmark data.
saveRDS(lss, "lss_ja.RDS")
saveRDS(dat, "data_lss_ja.RDS")

# simulation ---------------------------------

# Refit the model once for every slice size from 100 to 400 and record how
# the unscaled predictions correlate with the `human` document variable:
# r1 is the document-level correlation, r2 the correlation of yearly means.
# Note that the loop variable (and the saved column) is called `k` although
# it is passed to textmodel_lss() as `slice`; the model's own k stays at 400.
dat_sim <- data.frame()
for (k in seq(100, 400, by = 1)) {
  model <- textmodel_lss(
    dfmt,
    as.seedwords(seedwords$ja),
    head(poli, 2000),
    k = 400,
    slice = k,
    cache = TRUE
  )
  pred <- predict(model, newdata = dfmt_mnu, rescaling = FALSE)

  # Average predicted and human scores within each year.
  agg <- aggregate(
    list(lss = pred, human = dfmt_mnu$human),
    by = list(year = dfmt_mnu$year),
    FUN = mean,
    na.rm = TRUE
  )

  dat_sim <- rbind(
    dat_sim,
    data.frame(
      k = k,
      r1 = cor(pred, dfmt_mnu$human, use = "pair"),
      r2 = cor(agg$lss, agg$human, use = "pair")
    )
  )

  # Live progress: redraw both correlation series and echo the newest row.
  matplot(dat_sim[, 2:3], type = "b", xaxt = "n")
  axis(1, seq_len(nrow(dat_sim)), dat_sim$k)
  print(dat_sim[nrow(dat_sim), ])
}

# Store the full correlation table and the model from the final iteration.
saveRDS(dat_sim, "data_simulation_ja.RDS")
saveRDS(model, "lss_simulation_ja.RDS")


# resampling ---------------------------------

# Refit the model on random subsets of the corpus (5% to 100% of the distinct
# `docid` values in steps of 5, 20 draws per size) and record how the
# unscaled predictions correlate with the `human` document variable:
# r1 is the document-level correlation, r2 the correlation of yearly means.

# Fix the RNG state so the sampled subsets, and therefore the saved results,
# can be reproduced. The seed value itself is arbitrary.
set.seed(1234)

dat_smp <- data.frame()
docid <- unique(dfmt$docid)
for (p in seq(5, 100, 5)) {
  for (i in seq(20)) {
    # Multiply before dividing: (p / 100) * n can land just below a whole
    # number in floating point, which sample() would silently truncate to one
    # document fewer than intended. floor() keeps the truncation explicit for
    # genuinely fractional sizes.
    docid_sample <- sample(docid, floor(p * length(docid) / 100))
    model <- dfm_subset(dfmt, docid %in% docid_sample) %>%
      textmodel_lss(as.seedwords(seedwords$ja),
                    head(poli, 2000), k = 300)
    pred <- predict(model, newdata = dfmt_mnu, rescaling = FALSE)

    # Average predicted and human scores within each year.
    agg <- aggregate(list(lss = pred,
                          human = dfmt_mnu$human),
                     by = list(year = dfmt_mnu$year), FUN = mean, na.rm = TRUE)
    dat_smp <- rbind(dat_smp,
                     data.frame(p = p, i = i,
                                r1 = cor(pred, dfmt_mnu$human,
                                         use = "pairwise.complete.obs"),
                                r2 = cor(agg$lss, agg$human,
                                         use = "pairwise.complete.obs")))

    # Live progress: sample percentage against the yearly correlation (r2),
    # then echo the newest row.
    plot(dat_smp[,1], dat_smp[,4], type = "p")
    print(tail(dat_smp, 1))
  }
}
saveRDS(dat_smp, "data_bootstrap_ja.RDS")
